資料分析1

林嶔 (Lin, Chin)

Lesson 5

第一節:描述性統計(1)

– 請至這裡下載本週的範例資料

dat = read.csv("Example data.csv", header = TRUE)
head(dat)
##       eGFR Disease Survival.time Death Diabetes Cancer      SBP      DBP
## 1 34.65379       1     0.4771037     0        0      1 121.2353 121.3079
## 2 37.21183       1     3.0704424     0        1      1 122.2000 122.6283
## 3 32.60074       1     0.2607117     1        0      0 118.9136 121.7621
## 4 29.68481       1            NA    NA        0      0 118.2212 112.7043
## 5 28.35726       0     0.1681673     1        0      0 116.7469 115.7705
## 6 33.95012       1     1.2238556     0        0      0 119.9936 116.3872
##   Education Income
## 1         2      0
## 2         2      0
## 3         0      0
## 4         1      0
## 5         0      0
## 6         1      0
  1. 函數「mean()」可以幫助我們計算平均值
mean(dat$eGFR, na.rm = TRUE)
## [1] 34.14115
  1. 函數「sd()」可以幫助我們計算標準差
sd(dat$eGFR, na.rm = TRUE)
## [1] 7.062506
  1. 函數「var()」可以幫助我們計算變異數
var(dat$eGFR, na.rm = TRUE)
## [1] 49.87899
  1. 函數「median()」可以幫助我們計算變異數
median(dat$eGFR, na.rm = TRUE)
## [1] 34.33123
  1. 函數「quantile()」可以幫助我們計算百分位數
quantile(dat$eGFR, na.rm = TRUE)
##       0%      25%      50%      75%     100% 
## 10.00000 29.58404 34.33123 38.44965 60.00000
quantile(dat$eGFR, 0.5, na.rm = TRUE)
##      50% 
## 34.33123
quantile(dat$eGFR, 0.95, na.rm = TRUE)
##      95% 
## 45.86468
  1. 函數「min()」可以幫助我們找出最小值
min(dat$eGFR, na.rm = TRUE)
## [1] 10
  1. 函數「max()」可以幫助我們找出最大值
max(dat$eGFR, na.rm = TRUE)
## [1] 60
  1. 函數「table()」可以幫助我們產生列聯表
table(dat$Cancer)
## 
##   0   1 
## 576 402
table(dat$Cancer, dat$Diabetes)
##    
##       0   1
##   0 468  97
##   1 309  81
  1. 函數「prop.table()」可以幫助我們產生列聯表的百分比
tab1 = table(dat$Cancer)
prop.table(tab1)
## 
##         0         1 
## 0.5889571 0.4110429
tab2 = table(dat$Cancer, dat$Diabetes)
prop.table(tab2)
##    
##              0          1
##   0 0.49005236 0.10157068
##   1 0.32356021 0.08481675
prop.table(tab2, 1)
##    
##             0         1
##   0 0.8283186 0.1716814
##   1 0.7923077 0.2076923
prop.table(tab2, 2)
##    
##             0         1
##   0 0.6023166 0.5449438
##   1 0.3976834 0.4550562

第一節:描述性統計(2)

mean(dat[dat[,"Disease"] == 1,]$eGFR, na.rm = TRUE)
## [1] 34.41452
mean(dat[dat[,"Disease"] == 0,]$eGFR, na.rm = TRUE)
## [1] 33.19008
paste(mean(dat[dat[,"Disease"] == 1,]$eGFR, na.rm = TRUE), "±", sd(dat[dat[,"Disease"] == 1,]$eGFR, na.rm = TRUE), sep = "")
## [1] "34.4145222034604±7.1582590897065"
paste0(mean(dat[dat[,"Disease"] == 1,]$eGFR, na.rm = TRUE), "±", sd(dat[dat[,"Disease"] == 1,]$eGFR, na.rm = TRUE))
## [1] "34.4145222034604±7.1582590897065"
m = mean(dat[dat[,"Disease"] == 1,]$eGFR, na.rm = TRUE)
s = sd(dat[dat[,"Disease"] == 1,]$eGFR, na.rm = TRUE)
formatC(m, digits = 3, format = "f")
## [1] "34.415"
formatC(s, digits = 3, format = "f")
## [1] "7.158"
paste0(formatC(m, digits = 3, format = "f"), "±", formatC(s, digits = 3, format = "f"))
## [1] "34.415±7.158"

練習1:方便的自訂函數

  1. 函數之input必須為一類別變項及一連續變項

  2. 函數中必須設定一個變項,讓使用者能決定顯示小數點的位數

3-1. 第一個函數必須輸出在該類別變項為不同類別時,該連續變項之『平均數±標準差』

3-2. 第二個函數必須輸出在該類別變項為不同類別時,該連續變項之『中位數(25百分位-75百分位)』

– 注意:請考慮類別數不只是2的情形,以及類別項目並非為0、1、2、…的狀況

練習1答案

– 剩下的其實就是把前面的東西貼起來:

my_func.1 = function (x, y, dig = 3) {
  
  lvl.x = levels(factor(x))
  n.lvl.x = length(lvl.x)
  
  result = character(n.lvl.x)
  
  for (i in 1:n.lvl.x) {
    m = mean(y[x == lvl.x[i]], na.rm = TRUE)
    s = sd(y[x == lvl.x[i]], na.rm = TRUE)
    result[i] = paste0(formatC(m, digits = dig, format = "f"), "±", formatC(s, digits = dig, format = "f"))
  }
  
  result
  
}

my_func.2 = function (x, y, dig = 3) {
  
  lvl.x = levels(factor(x))
  n.lvl.x = length(lvl.x)
  
  result = character(n.lvl.x)
  
  for (i in 1:n.lvl.x) {
    m = median(y[x == lvl.x[i]], na.rm = TRUE)
    q1 = quantile(y[x == lvl.x[i]], 0.25, na.rm = TRUE)
    q2 = quantile(y[x == lvl.x[i]], 0.75, na.rm = TRUE)
    result[i] = paste0(formatC(m, digits = dig, format = "f"), "(", formatC(q1, digits = dig, format = "f"), "-", formatC(q2, digits = dig, format = "f"), ")")
  }
  
  result
  
}
my_func.1(dat$Education, dat$SBP)
## [1] "121.365±10.568" "121.406±10.920" "122.853±10.532"
my_func.2(dat$Income, dat$eGFR, dig = 1)
## [1] "34.3(29.5-38.2)" "33.8(28.9-38.8)" "34.9(30.4-40.0)"

第二節:列表(List)層物件基本介紹(1)

– 列表(List)層分為列表(list)、S3物件(S3 class)及S4物件(S4 class):

  1. 列表(list):在R裡面,向量的上層是陣列層物件。若是我們希望在一個物件內放置很多陣列層物件,我們會用到列表。値得一提的是,列表裡面可以同時包含數個陣列層物件及變數層物件。

  2. S3物件(S3 class):S3物件是一種特殊的列表物件,他的變化會在後面慢慢介紹。

  3. S4物件(S4 class):S4物件與前面兩種有非常大的不同,相關的函數也不一樣,在本節課我們不會教到。

# 先產生一個數値矩陣物件
x1 = 1:20
M1 = matrix(x1, nrow = 4, ncol = 5)
M1
##      [,1] [,2] [,3] [,4] [,5]
## [1,]    1    5    9   13   17
## [2,]    2    6   10   14   18
## [3,]    3    7   11   15   19
## [4,]    4    8   12   16   20
# 再產生一個文字矩陣物件
x2 = c("A", "B", "C", "A", "C", "B", "B", "B", "A")
M2 = matrix(x2, nrow = 3, ncol = 3)
M2
##      [,1] [,2] [,3]
## [1,] "A"  "A"  "B" 
## [2,] "B"  "C"  "B" 
## [3,] "C"  "B"  "A"
# 再產生一個邏輯向量
x3 = c(TRUE, FALSE, TRUE, FALSE)
x3
## [1]  TRUE FALSE  TRUE FALSE
# 將上述這些物件打包成一個列表物件
L1 = list(M1, M2, x3)
L1
## [[1]]
##      [,1] [,2] [,3] [,4] [,5]
## [1,]    1    5    9   13   17
## [2,]    2    6   10   14   18
## [3,]    3    7   11   15   19
## [4,]    4    8   12   16   20
## 
## [[2]]
##      [,1] [,2] [,3]
## [1,] "A"  "A"  "B" 
## [2,] "B"  "C"  "B" 
## [3,] "C"  "B"  "A" 
## 
## [[3]]
## [1]  TRUE FALSE  TRUE FALSE

第二節:列表(List)層物件基本介紹(2)

  1. 函數「length()」可以協助我們了解物件長度

  2. 函數「class()」可以查詢該物件的屬性

  3. 函數「names()」可以協助我們命名物件

  4. 函數「ls()」可以協助我們看看物件中有哪些東西

length(L1)
## [1] 3
class(L1)
## [1] "list"
names(L1) = c("A", "B", "C")
L1
## $A
##      [,1] [,2] [,3] [,4] [,5]
## [1,]    1    5    9   13   17
## [2,]    2    6   10   14   18
## [3,]    3    7   11   15   19
## [4,]    4    8   12   16   20
## 
## $B
##      [,1] [,2] [,3]
## [1,] "A"  "A"  "B" 
## [2,] "B"  "C"  "B" 
## [3,] "C"  "B"  "A" 
## 
## $C
## [1]  TRUE FALSE  TRUE FALSE
ls(L1)
## [1] "A" "B" "C"

第二節:列表(List)層物件基本介紹(3)

L1[[2]]
##      [,1] [,2] [,3]
## [1,] "A"  "A"  "B" 
## [2,] "B"  "C"  "B" 
## [3,] "C"  "B"  "A"
L1[["B"]]
##      [,1] [,2] [,3]
## [1,] "A"  "A"  "B" 
## [2,] "B"  "C"  "B" 
## [3,] "C"  "B"  "A"
L1$B
##      [,1] [,2] [,3]
## [1,] "A"  "A"  "B" 
## [2,] "B"  "C"  "B" 
## [3,] "C"  "B"  "A"
L1[[2]][2,3]
## [1] "B"
L1[["B"]][3,1]
## [1] "C"
L1$B[1,2]
## [1] "A"

第二節:列表(List)層物件基本介紹(4)

#先看看L1的樣子
L1
## $A
##      [,1] [,2] [,3] [,4] [,5]
## [1,]    1    5    9   13   17
## [2,]    2    6   10   14   18
## [3,]    3    7   11   15   19
## [4,]    4    8   12   16   20
## 
## $B
##      [,1] [,2] [,3]
## [1,] "A"  "A"  "B" 
## [2,] "B"  "C"  "B" 
## [3,] "C"  "B"  "A" 
## 
## $C
## [1]  TRUE FALSE  TRUE FALSE
#先看看L1的物件屬性
class(L1)
## [1] "list"
#強迫L1成為別的物件屬性
class(L1) = "test"
#再看看L1的物件屬性
class(L1)
## [1] "test"
#看看L1現在的樣子
L1
## $A
##      [,1] [,2] [,3] [,4] [,5]
## [1,]    1    5    9   13   17
## [2,]    2    6   10   14   18
## [3,]    3    7   11   15   19
## [4,]    4    8   12   16   20
## 
## $B
##      [,1] [,2] [,3]
## [1,] "A"  "A"  "B" 
## [2,] "B"  "C"  "B" 
## [3,] "C"  "B"  "A" 
## 
## $C
## [1]  TRUE FALSE  TRUE FALSE
## 
## attr(,"class")
## [1] "test"

– 小提示:當你使用函數「class()」可以查詢該物件的屬性,若非常見的幾種屬性名稱,那就非常有可能是S3物件(S3 class)或S4物件(S4 class)

第二節:列表(List)層物件基本介紹(5)

#先寫一個自訂函數「print.test()」
print.test = function(test) {
  cat("此列表共有",length(test),"個物件\n")
  cat("物件名稱分別為:\n")
  cat(paste(names(test), collapse = ", "), "\n")
}

#再看看請R列印出L1會變什麼
L1
## 此列表共有 3 個物件
## 物件名稱分別為:
## A, B, C

– 列表(list)的幾個常見函數還是能夠使用:

ls(L1)
## [1] "A" "B" "C"
length(L1)
## [1] 3
class(L1)
## [1] "test"
names(L1) = c("D", "E", "F")
L1
## 此列表共有 3 個物件
## 物件名稱分別為:
## D, E, F

第二節:列表(List)層物件基本介紹(6)

– 在寫之前我們先看看直接對L1使用函數「summary()」會怎樣

summary(L1)
##   Length Class  Mode     
## D 20     -none- numeric  
## E  9     -none- character
## F  4     -none- logical

– 現在我們可以讓函數「summary()」使用後產生不同的結果

#先寫一個自訂函數「summary.test()」
summary.test = function(test) {
  cat("此列表共有",length(test),"個物件\n")
  cat("物件名稱分別為:\n")
  cat(paste(names(test), collapse = ", "), "\n")
  for (i in 1:length(test)) {
    cat(names(test)[i], "之物件屬性為", class(test[[i]]), "\n")
  }
}

#再看看使用函數「summary()」後會變什麼
summary(L1)
## 此列表共有 3 個物件
## 物件名稱分別為:
## D, E, F 
## D 之物件屬性為 matrix 
## E 之物件屬性為 matrix 
## F 之物件屬性為 logical

練習2:S3物件練習

  1. 平均數必須儲存在列表(list)物件的第一個位置,並且必須儲存為一『數値變數』,長度不限

  2. 標準差必須儲存在列表(list)物件的第二個位置,並且必須儲存為一『數値變數』,長度不限

  3. 各組別樣本數必須儲存在列表(list)物件的第三個位置,並且必須儲存為一『數値變數』,長度不限

  4. 列表(list)物件的第四個位置必須放置『平均數±標準差』,並且必須儲存為一『文字變數』,長度不限

  5. 列表(list)物件的第五個位置必須放置當初輸入的類別變項之『各類別名稱』,並且必須儲存為一『文字變數』,長度不限

  6. 在使用函數「print()」時,我想看到連續幾行的『組別:‘XXX’之平均數±標準差為’XXX±XXX’(個案數=XXX)』。

練習2答案

my_func.S3 = function (x, y, dig = 3) {
  
  lvl.x = levels(factor(x))
  n.lvl.x = length(lvl.x)
  
  my_list = list()
  my_list[[1]] = numeric(n.lvl.x)
  my_list[[2]] = numeric(n.lvl.x)
  my_list[[3]] = numeric(n.lvl.x)
  my_list[[4]] = character(n.lvl.x)
  my_list[[5]] = character(n.lvl.x)
  
  names(my_list) = c("mean", "sd", "n", "description", "name")
  
  for (i in 1:n.lvl.x) {
    current_vec = y[x == lvl.x[i]]
    my_list[[1]][i] = mean(current_vec, na.rm = TRUE)
    my_list[[2]][i] = sd(current_vec, na.rm = TRUE)
    my_list[[3]][i] = sum(table(current_vec))
    my_list[[4]][i] = paste0(formatC(my_list[[1]][i], digits = 3, format = "f"), "±", formatC(my_list[[2]][i], digits = 3, format = "f"))
    my_list[[5]][i] = lvl.x[i]
  }
  
  class(my_list) = 'myS3'
  my_list
  
}

print.myS3 = function (x) {
  
  n.lvl.x = length(x[["name"]])
  
  for (i in 1:n.lvl.x) {
    cat(paste0("組別:", x[["name"]][i], "之平均數±標準差為", x[["description"]][i], "(個案數=", x[["n"]][i], ")\n"))
  }
  
}
my_list = my_func.S3(dat$Education, dat$SBP)

ls(my_list)
## [1] "description" "mean"        "n"           "name"        "sd"
my_list
## 組別:0之平均數±標準差為121.365±10.568(個案數=488)
## 組別:1之平均數±標準差為121.406±10.920(個案數=284)
## 組別:2之平均數±標準差為122.853±10.532(個案數=184)

第三節:t test and Wilcoxon test(1)

– 這時候我們需要使用函數「t.test()」及「wilcox.test()」

result1 = t.test(dat[,"eGFR"]~dat[,"Diabetes"], var.equal = FALSE)
result1
## 
##  Welch Two Sample t-test
## 
## data:  dat[, "eGFR"] by dat[, "Diabetes"]
## t = -12.506, df = 254.94, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -7.967643 -5.799633
## sample estimates:
## mean in group 0 mean in group 1 
##        32.87223        39.75587
result2 = wilcox.test(dat[,"eGFR"]~dat[,"Diabetes"])
result2
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  dat[, "eGFR"] by dat[, "Diabetes"]
## W = 29815, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0

第三節:t test and Wilcoxon test(2)

class(result1)
## [1] "htest"
class(result2)
## [1] "htest"
ls(result1)
## [1] "alternative" "conf.int"    "data.name"   "estimate"    "method"     
## [6] "null.value"  "parameter"   "p.value"     "statistic"
ls(result2)
## [1] "alternative" "data.name"   "method"      "null.value"  "parameter"  
## [6] "p.value"     "statistic"

第三節:t test and Wilcoxon test(3)

var.result = var.test(dat[,"eGFR"]~dat[,"Diabetes"])
var.result
## 
##  F test to compare two variances
## 
## data:  dat[, "eGFR"] by dat[, "Diabetes"]
## F = 1.0183, num df = 767, denom df = 171, p-value = 0.8997
## alternative hypothesis: true ratio of variances is not equal to 1
## 95 percent confidence interval:
##  0.7979204 1.2767309
## sample estimates:
## ratio of variances 
##           1.018323
result3 = t.test(dat[,"eGFR"]~dat[,"Diabetes"], var.equal = TRUE)
result3
## 
##  Two Sample t-test
## 
## data:  dat[, "eGFR"] by dat[, "Diabetes"]
## t = -12.434, df = 938, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -7.970133 -5.797143
## sample estimates:
## mean in group 0 mean in group 1 
##        32.87223        39.75587

練習3:自訂分析流程

  1. 函數之input必須為一『類別變項』及一『連續變項』

  2. 先做一個檢查,如果該『類別變項』之類別數不等於2,則不執行函數或產生警告

  3. 先檢查兩組個案數是否皆>25,如果是就做t test,否則就做Wilcoxon test。

  4. 如果選擇做做t test,先確定變異數是否同質,並依據變異數同值性檢定的結果,決定要使用哪一種t test。

練習3答案

my_func.analysis = function (x, y) {
  
  lvl.x = levels(factor(x))
  n.lvl.x = length(lvl.x)
  
  if (n.lvl.x != 2) {
    
    cat("『類別變項』之類別數不等於2\n")
    
  } else {
    
    n.each = numeric(n.lvl.x)
    
    for (i in 1:n.lvl.x) {
      
      current_vec = y[x == lvl.x[i]]
      n.each[i] = sum(table(current_vec))
      
    }
    
    if (FALSE %in% (n.each > 25)) {
      
      result = wilcox.test(y~x)
      
    } else {
      
      var.result = var.test(y~x)
      
      if (var.result[['p.value']] < 0.05) {
        
        result = t.test(y~x, var.equal = FALSE)
        
      } else {
        
        result = t.test(y~x, var.equal = TRUE)
        
      }
      
    }
    
    result
  
  }
  
}
my_func.analysis(dat$Income, dat$eGFR)
## 『類別變項』之類別數不等於2
my_func.analysis(dat$Disease, dat$eGFR)
## 
##  Welch Two Sample t-test
## 
## data:  y by x
## t = -2.4556, df = 466.06, p-value = 0.01443
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -2.2043011 -0.2445876
## sample estimates:
## mean in group 0 mean in group 1 
##        33.19008        34.41452
my_func.analysis(dat[dat$Income == 2,]$Disease, dat[dat$Income == 2,]$eGFR)
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  y by x
## W = 691, p-value = 0.2357
## alternative hypothesis: true location shift is not equal to 0

第四節:ANOVA and Kruskal-Wallis test(1)

  1. 我們需要使用函數「aov()」及「anova()」執行ANOVA

  2. 函數「kruskal.test()」可以幫助我們做Kruskal-Wallis test

Variance.table = aov(dat[,"eGFR"]~as.factor(dat[,"Education"]))
Variance.table
## Call:
##    aov(formula = dat[, "eGFR"] ~ as.factor(dat[, "Education"]))
## 
## Terms:
##                 as.factor(dat[, "Education"]) Residuals
## Sum of Squares                         213.93  47107.75
## Deg. of Freedom                             2       946
## 
## Residual standard error: 7.056683
## Estimated effects may be unbalanced
## 51 observations deleted due to missingness
ANOVA.table = anova(Variance.table)
ANOVA.table
## Analysis of Variance Table
## 
## Response: dat[, "eGFR"]
##                                Df Sum Sq Mean Sq F value Pr(>F)
## as.factor(dat[, "Education"])   2    214 106.966   2.148 0.1173
## Residuals                     946  47108  49.797
KW.result = kruskal.test(dat[,"eGFR"]~dat[,"Education"])
KW.result
## 
##  Kruskal-Wallis rank sum test
## 
## data:  dat[, "eGFR"] by dat[, "Education"]
## Kruskal-Wallis chi-squared = 4.519, df = 2, p-value = 0.1044

練習4:加強分析流程

  1. 先檢查類別數是否>=3,若是則使用ANOVA或Kruskal-Wallis test,否則則使用t test或Wilcoxon test。

  2. 檢查各組內個案數是否皆>25,如果是就做ANOVA,否則就做Kruskal-Wallis test。

練習4答案

my_func.analysis = function (x, y) {
  
  lvl.x = levels(factor(x))
  n.lvl.x = length(lvl.x)
  
  n.each = numeric(n.lvl.x)
  
  for (i in 1:n.lvl.x) {
    
    current_vec = y[x == lvl.x[i]]
    n.each[i] = sum(table(current_vec))
    
  }
  
  
  if (n.lvl.x > 2) {
    
    if (FALSE %in% (n.each > 25)) {
      
      result = kruskal.test(y~x)

    } else {
      
      Variance.table = aov(y~as.factor(x))
      result = anova(Variance.table)
      
    }
    
  } else {
    
    if (FALSE %in% (n.each > 25)) {
      
      result = wilcox.test(y~x)
      
    } else {
      
      var.result = var.test(y~x)
      
      if (var.result[['p.value']] < 0.05) {
        
        result = t.test(y~x, var.equal = FALSE)
        
      } else {
        
        result = t.test(y~x, var.equal = TRUE)
        
      }
      
    }
    
  }
  
  result
  
}
my_func.analysis(dat$Income, dat$eGFR)
## Analysis of Variance Table
## 
## Response: y
##               Df Sum Sq Mean Sq F value Pr(>F)
## as.factor(x)   2    102  51.016  1.0115 0.3641
## Residuals    944  47613  50.438
my_func.analysis(dat[dat$Income == 2,]$Education, dat[dat$Income == 2,]$eGFR)
## 
##  Kruskal-Wallis rank sum test
## 
## data:  y by x
## Kruskal-Wallis chi-squared = 8.7301, df = 2, p-value = 0.01271

家庭作業

– 以下範例為自變項為糖尿病,依變項為eGFR。

##   Variable Group:0     Group:1     p-value 
## 1 "eGFR"   "32.9±6.6" "39.8±6.5" "<0.001"
.ROUND.p = function(X, p.digits=3) {
  p=rep(NA,length(X))
  for (a in 1:length(X)) {
    if (!is.na(X[a])) {
      if (p.digits==0) {p[a]="<1"} else {
        p[a]=as.character(round(X[a],p.digits))
        if (p[a]=="0") {
          if (p.digits==1) {p[a]="<0.1"} else {
            p[a]="<0."
            for (l in 1:(p.digits-1)) {
              p[a]=paste0(p[a],0)
            }
            p[a]=paste0(p[a],1)  
          }
        } else {
          p[a]=formatC(as.numeric(X[a]), format="f", digits = p.digits)
        }
      }
    }
  }
  return(p)
}

.ROUND.p(0.000518, 2)
## [1] "<0.01"
.ROUND.p(0.000518, 3)
## [1] "0.001"
.ROUND.p(0.000518, 4)
## [1] "0.0005"

小結